Model comparison (done correctly) helps to choose the model that provides a good representation of the true DGP by penalizing models that “overfit”. This penalization is achieved mainly by assessing “fit” not on a training data set, but on a hold out test data set.
A complementary approach to work against “overfitting” is to specify priors that shrink model coefficients towards zero. Such shrinkage priors are typically normally distributed, have a mean of zero and a relatively small standard deviation. Here relative refers to the scale on which a predictor is measured.
To visualize how shrinkage works, we generate a small data set for which we want to estimate the mean:
# Simulate a small sample: standardize 15 normal draws (mean 0, sd 1),
# then shift so the sample mean is exactly 1.
set.seed(5)
y = scale(rnorm(15)) + 1
hist(y)
We want a Bayesian estimate of the mean of y \(\small y \sim normal(mu, 1)\) with two
different priors \(\small mu \sim
normal(0,\sigma)\) for mu:
# Visualize the two candidate priors for mu: normal(0, 1) vs normal(0, 3).
par(mar = c(5, 3, .5, .1))
prior_cols = c("blue", adjustcolor("blue", alpha = .5))
prior_labels = c("mu ~ normal(0,1)", "mu ~ normal(0,3)")
curve(dnorm(x, 0, 1), -10, 10, n = 500,
      ylab = "density", xlab = "mu", col = prior_cols[1], lwd = 3)
curve(dnorm(x, 0, 3), n = 500, add = TRUE,
      ylab = "density", xlab = "mu", col = prior_cols[2], lwd = 3)
legend("topleft",
       lty = 1, lwd = 3,
       col = prior_cols,
       legend = prior_labels,
       bty = "n")
Here, we calculate the likelihood and the prior probability of
different estimates for the mean of y
# Grid of candidate means; for each, the log likelihood of y (model
# y ~ normal(mu, 1)) plus the log prior density under the two priors.
# FIX: replaced apply() over a data.frame (which coerces to matrix) with
# vapply(), and T with TRUE. The element names ("mu.mu", "Psd1.mu",
# "Psd3.mu") are kept identical to the original apply() result because
# later code indexes P by these row names.
mus = seq(-1, 3, .01)
P =
  vapply(
    mus,
    function(mu) {
      c(
        mu.mu = mu,
        # log P(data | mu)
        llikelihood = sum(dnorm(y, mean = mu, log = TRUE)),
        # log P(mu), mu ~ normal(0, 1)
        Psd1.mu = dnorm(mu, 0, sd = 1, log = TRUE),
        # log P(mu), mu ~ normal(0, 3)
        Psd3.mu = dnorm(mu, 0, sd = 3, log = TRUE)
      )
    },
    FUN.VALUE = c(mu.mu = 0, llikelihood = 0, Psd1.mu = 0, Psd3.mu = 0)
  )
P[, 1:5]
## [,1] [,2] [,3] [,4] [,5]
## mu.mu -1.000000 -0.990000 -0.980000 -0.970000 -0.960000
## llikelihood -50.784078 -50.484828 -50.187078 -49.890828 -49.596078
## Psd1.mu -1.418939 -1.408989 -1.399139 -1.389389 -1.379739
## Psd3.mu -2.073106 -2.072001 -2.070906 -2.069823 -2.068751
Next, we calculate the posterior probabilities:
# Unnormalized posterior = likelihood * prior, computed on the log scale
# and exponentiated at the end.
log_post1 = P["llikelihood", ] + P["Psd1.mu", ]
log_post3 = P["llikelihood", ] + P["Psd3.mu", ]
post1 = exp(log_post1)
post3 = exp(log_post3)
Now we can show likelihood, prior and posterior together:
# Three stacked panels: likelihood (top), the two priors (middle), and
# the resulting unnormalized posteriors (bottom).
par(mfrow = c(3,1), mar=c(2.75,2.75,0,.25), mgp=c(1.25,.1,0), tck=-.01)
# Panel 1: likelihood of the data as a function of mu
plot(mus,exp(P["llikelihood",]),'l', xlab = "", ylab = "P(y|mu)", col = "red", lwd = 2)
# Panel 2: the two priors, narrow (solid blue) and wide (transparent blue)
curve(dnorm(x,0,1), min(mus), max(mus), n = 500,
ylab = "P(mu)", xlab = "", col = "blue", lwd = 3)
curve(dnorm(x,0,3), n = 500, add = T,
ylab = "density", xlab = "", col = adjustcolor("blue",alpha = .5), lwd = 3)
legend("topright",
lty = 1, lwd = 3,
col = c("blue",adjustcolor("blue",alpha = .5)),
legend = c("mu ~ normal(0,1)","mu ~ normal(0,3)"),
bty = "n")
# Panel 3: unnormalized posteriors; dotted vertical lines mark the
# posterior mode under each prior
plot(mus,post1,'l', xlab = "mu", ylab = "P(y|mu) * P(mu)", col = "purple", lwd = 2)
lines(mus,post3,'l', col = adjustcolor("purple",alpha = .5), lwd = 2)
abline(v = c(mus[which.max(post1)],mus[which.max(post3)]), lty = 3,
col = c("purple",adjustcolor("purple",alpha = .5)))
legend("topright",
lty = 1, lwd = 3,
col = c("purple",adjustcolor("purple",alpha = .5)),
legend = c("mu ~ normal(0,1)","mu ~ normal(0,3)"),
bty = "n")
We zoom in to see more clearly how the narrow prior shrinks the
estimated mu towards zero
# Zoom into the region around the posterior modes to make the shrinkage
# visible: the narrow prior pulls the mode towards zero.
par(mar = c(5,3,.5,.1))
ids = which(mus > 0 & mus < 1.5)
plot(mus[ids],post1[ids],'l', xlab = "mu", ylab = "P(y|mu) * P(mu)", col = "purple", lwd = 2)
lines(mus[ids],post3[ids],'l', col = adjustcolor("purple",alpha = .5), lwd = 2)
# dotted vertical lines at the two posterior modes
abline(v = c(mus[which.max(post1)],mus[which.max(post3)]), lty = 3,
col = c("purple",adjustcolor("purple",alpha = .5)))
# arrow from the wide-prior mode to the narrow-prior mode (the shrinkage)
arrows(x0 = mus[which.max(post3)],
x1 = mus[which.max(post1)],
y0 = mean(c(max(post1),max(post3))),
lwd = 3, length = .15)
legend("topright",
lty = 1, lwd = 3,
col = c("purple",adjustcolor("purple",alpha = .5)),
legend = c("mu ~ normal(0,1)","mu ~ normal(0,3)"),
bty = "n")
To show that shrinkage works, we estimate spline models with different standard deviations on regression coefficients for the simulated income / well being data above.
The following figure shows the estimated relationships for different samples drawn from the population.
Hopefully you can see that large deviations between the true DGP in red and the estimated DGP in blue are less frequent when the prior on regression coefficients is narrow (top left) compared to when it is wider (bottom right).
To confirm this, the following plot shows deviances from a simulation that - samples 1000 times training and test data, - estimates model parameters on the training set - calculates deviances for the training and test data sets.
The following figure shows deviances +/- 1 sd.
# Simulation results produced offline: elpd.train, elpd.test (matrices,
# one row per simulation, one column per prior sd) and b.sds.
load("sim_lppd.Rdata")

# Plot mean train and test deviance (+/- 1 within-simulation sd) against
# the prior sd of the regression coefficients.
plot_deviances = function() {
  D.test = -2*elpd.test
  D.train = -2*elpd.train
  m.test = colMeans(D.test)
  m.train = colMeans(D.train)
  # column-wise sd after removing each simulation's (row) mean, i.e. the
  # within-simulation spread across prior sds
  within_sd = function(m) {
    apply(apply(m, 2, function(x) x - rowMeans(m)), 2, sd)
  }
  # FIX: the error bars are drawn around mean *deviances*, so the sd must
  # be computed on the deviance scale as well (previously the sd of elpd
  # was used, which is too small by a factor of 2).
  lower.test = m.test - within_sd(D.test)
  upper.test = m.test + within_sd(D.test)
  lower.train = m.train - within_sd(D.train)
  upper.train = m.train + within_sd(D.train)
  par(mfrow = c(2,1), mar=c(2.5,2.5,.5,.5), mgp=c(1,.1,0), tck=-.01)
  ylim = range(c(lower.train, upper.train))
  xs = 1:ncol(elpd.test)
  plot(xs, m.train, ylim = ylim,
       ylab = "Deviance train", xlab = "", xaxt = "n")
  # FIX: label the axis from b.sds instead of a hard-coded set of 5
  # values, so both panels agree and stay correct if b.sds changes
  axis(1, at = xs, labels = b.sds)
  arrows(xs, y0 = lower.train, y1 = upper.train, length = 0)
  ylim = range(c(lower.test, upper.test))
  plot(xs + .2, m.test, col = "blue", pch = 16, ylim = ylim,
       ylab = "Deviance test", xlab = "sd(b)", xaxt = "n")
  arrows(xs + .2, y0 = lower.test, y1 = upper.test, length = 0, col = "blue")
  axis(1, at = xs, labels = b.sds)
}
plot_deviances()
Indeed, we see that while models with wider priors on the regression coefficients have lower deviances in the training data, they have larger deviances for the test data. This also becomes clear if we plot many estimated DGPs together. In the following figure, the blue lines show the expected y values inferred from 100 random samples, and the red lines the true relationship between x and y.
So far we have implemented some simple cross validation manually, by simply splitting our total sample in half. However, data can also be split differently, e.g. 20% training data and 80% test data, or the other way around.
One popular way to split the data is to fit the model on \(\small N-1\) data points and to use the \(\small N\)th data point as test data. This is referred to as leave-one-out cross validation, or LOO-CV.
One thing that is easily implemented with LOO-CV is to calculate the deviance as the average over the \(\small N\) LOO-CV deviances, because for LOO-CV there are always exactly \(\small N\) different training data sets, each of size \(\small N-1\).
This is different for alternative schemes. For instance, at a sample size of 20, there are \(\small \binom{20}{10} = 184756\) ways to construct a half/half training data set. Here, averaging over all possible test-data deviances would be too computationally expensive.
The key thing to keep in mind when doing cross validation is that dependent observations (more specifically, observations with correlated errors) should always be kept in either test or training data sets. Such complications plays e.g. a role in time series or hierarchically organized data sets.
The computationally challenging part of LOO-CV is that one needs to estimate the model \(\small N-1\) times.
Fortunately, one can approximate LOO-CV with a method called Pareto smoothed importance sampling (PSIS-LOO). Here, not all \(\small lppd\) values receive the same weight when calculating the deviance; instead, the importance weights are stabilized by fitting a generalized Pareto distribution to their tail.
The goal of information criteria is to give us the information that out of sample prediction (cross validation) gives us, without the need to split the data into test and training sets.
Information criteria are calculated from two quantities
All well known information criteria, like AIC, DIC, WAIC use the same deviance term, but they differ in their penalty:
Never heard of WAIC? Why should you use it?
The benchmark is out of sample prediction or cross validation, where we
In the following figure from the book, these cross validation values are represented by dots (empty for flat priors, black for shrinkage priors).
Lines in the figures are average values over 1000 simulations and show that on average information criteria, (LOO)-CV, PSIS-LOO and WAIC do a good job of approximating full cross validation.
When looking at averages, positive and negative differences to full cross validation can cancel. Therefore, it is good to also look at the average absolute difference, i.e. for each simulated data set we calculate the absolute value of the difference between the cross validation lppd and the approximation. This is shown in the following figure:
Bottom line: PSIS-LOO and WAIC are relatively close to full cross validation and can be computed relatively cheaply. In practice, it is good to use both and take discrepant results as a warning signal.
Information criteria are often used for model selection, but this is not the only way and should not even be the most important way to use them.
More generally, we can think of two goals of model selection:
Model comparison should not be used for covariate selection when we want to estimate causal effects. When our interest is in causal effects, covariate selection should be determined based on the DAG1.
To illustrate why model comparison is not useful for covariate selection, we can go back to the example about the effect of treatment on well being:
# DAG for the mediation example: treatment T affects well-being W both
# directly and indirectly through the mediator M (motivation).
source("../Chapter6/dag_utils.R")
par(mfrow = c(1,2))
M_dag = dagitty(
"dag{
M->W;
T->M;
T->W
}")
# fix node positions for plotting
coord.list =
list(
x=c(T=0,W=2,M=1),
y=c(T=0,W=0,M=-1))
coordinates(M_dag) = coord.list
drawmydag(M_dag,cex = 2)
# Simulate data consistent with this DAG.
# NOTE(review): the variable T masks the built-in shorthand for TRUE from
# here on -- subsequent code should spell out TRUE/FALSE.
N = 150
set.seed(1)
T = rnorm(N)
M = T + rnorm(N)
# direct effect of T is 0.1; another 0.5 flows through M (total 0.6)
W = 0.1*T + 0.5*M + rnorm(N)
grayblue = colorRampPalette(c("grey","blue"))
par(mar=c(2.5,2.5,.5,.5), mgp=c(1,.1,0), tck=-.01)
# color points by the rank of the mediator to show how M tracks T and W
plot(W~T, col = grayblue(N)[rank(M)], pch = 16)
legend("topleft",pch = 16, col = c("gray","blue"),
legend = c("low M", "high M"), bty = "n")
Now we can estimate different models:
# Data list for quap; T, M, W were simulated above
dt = list(T = T, M = M, W = W)
# adjusts for the mediator M (incorrect for the total treatment effect:
# conditioning on a mediator blocks the indirect path T -> M -> W)
W.TM = quap(
alist(
W ~ dnorm(mu, sigma),
mu <- a + bT*T + bM*M,
a ~ dnorm(0,1),
bT ~ dnorm(0,1),
bM ~ dnorm(0,1),
sigma ~ dexp(1)),
data = dt)
# no adjustment for the mediator (correct model for the total effect of T)
W.T = quap(
alist(
W ~ dnorm(mu, sigma),
mu <- a + bT*T,
a ~ dnorm(0,1),
bT ~ dnorm(0,1),
sigma ~ dexp(1)),
data = dt)
# effect of mediator only (incorrect model)
W.M = quap(
alist(
W ~ dnorm(mu, sigma),
mu <- a + bM*M,
a ~ dnorm(0,1),
bM ~ dnorm(0,1),
sigma ~ dexp(1)),
data = dt)
# intercept-only model: no effect at all (incorrect model)
W.a = quap(
alist(
W ~ dnorm(mu, sigma),
mu <- a,
a ~ dnorm(0,1),
sigma ~ dexp(1)),
data = dt)
To see the WAIC for the model W.TM we use the
WAIC function from the rethinking package.
# WAIC (rethinking package) for the model that adjusts for the mediator
WAIC(W.TM) %>% round(1)
## WAIC lppd penalty std_err
## 1 441.1 -216.4 4.2 17.4
To compare the models, we use the compare function:
# compare() estimates WAIC by sampling, so call it once and reuse the
# result; calling it twice (as before) could print two slightly
# different tables for the same models.
cmp = compare(W.TM, W.T, W.M, W.a)
cmp %>% round(1)
cmp %>%
  data.frame() %>%
  kable(digits = 2) %>%
  kable_styling(full_width = F)
| WAIC | SE | dWAIC | dSE | pWAIC | weight | |
|---|---|---|---|---|---|---|
| W.M | 439.79 | 17.51 | 0.00 | NA | 3.23 | 0.67 |
| W.TM | 441.24 | 17.59 | 1.44 | 1.70 | 4.28 | 0.33 |
| W.T | 473.18 | 17.02 | 33.39 | 12.27 | 3.42 | 0.00 |
| W.a | 499.77 | 19.26 | 59.98 | 16.07 | 2.34 | 0.00 |
Here is an explanation of the columns:
We see that in a scenario where a good part of the treatment effect
was mediated by motivation (the DGP is M = T + rnorm(N),
W = 0.1*T + 0.5*M + rnorm(N)) the model we should use to
estimate the treatment effect is not the best model by WAIC, and is even worse than
a model that only uses the mediator.
One important benefit of WAIC (and PSIS-LOO) is that they come with a
standard error, so that we can compare differences in WAIC with our
uncertainty about the difference. Looking at the two best models, we see
that W.M is around 2.4 WAIC better than W.TM
and that the standard error of this difference is 1.86. Given that the WAIC
difference is firmly within two times dSE we cannot
conclude that the W.M model is certainly better than the
W.TM model. In comparison, the W.M model is
certainly better than the W.T model, with a WAIC difference
of 33.1 and a standard error of 12.21.
This is more easily seen in a plot:
# Visual comparison: filled circles = in-sample deviance, open circles =
# WAIC +/- 2*SE, triangles = pairwise WAIC differences +/- 2*dSE
plot(compare(W.TM, W.T, W.M, W.a))
legend("topright",
pch = c(16,1,2),
legend = c("sample deviance",
"WAIC +/- 2*SE",
"WAIC +/- 2*dSE"),
bty = "n")
What this plot also shows is that if we wanted to compared models
W.T and W.a to decide if treatment was
effective, the standard errors of the difference show us that we would
not be very certain that the true model is better, even though the
treatment effect is clearly different from zero:
# Posterior summary of the correct model: bT is clearly positive
precis(W.T) %>% round(2)
## mean sd 5.5% 94.5%
## a 0.02 0.09 -0.13 0.17
## bT 0.58 0.10 0.41 0.74
## sigma 1.14 0.07 1.04 1.25
One intuition for this is that if the data are relatively noisy, model comparison techniques will have difficulties being sure that one model is clearly better than others, even if within one model the effect of treatment is clear. Model comparison techniques only care about out of sample predictions and penalize for additional parameters. If, due to noisy data, adding a causal variable does not improve prediction by much, this model will not clearly outperform simpler models.
To correctly estimate effects it is not sufficient to choose the correct adjustment variables, but the functional form should also be (approximately?) correct.
Functional form refers to the form of the function that describes a relationship. The default for linear regression is a linear relationship, but we have seen that relationships can be non-linear.
We have collected the data in the following plot.
# Simulate non-linear data for the functional-form example.
N = 500
set.seed(1)
x = sort(rnorm(N,mean = 0))
# true DGP: a cubic B-spline basis with all three coefficients equal to 1
bf.s = splines::bs(x,df = 3)
bf.b = matrix(c(1,1,1),ncol = 1)
mu = bf.s %*% bf.b
# observe only a subsample of nn points
nn = 75
sidx = sample(N,nn)
# y.n: normal errors (only used by the commented-out plotting line below)
y.n = rnorm(nn, mu[sidx] , sd = .1)
# y: heavy-tailed errors (student t, nu = 3), so outliers are expected;
# rstudent() comes from the rethinking package
y = rstudent(nn, nu = 3, mu = mu[sidx], sigma = .1)
par(mfrow = c(1,1))
plot(x[sidx],y, xlab = "x", pch = 16)
#points(x[sidx],y.n, col = adjustcolor("black",alpha = .25))
#lines(x,mu, col = "red", lwd = 2)
# keep only the sampled x values from here on (overwrites the full x)
x = x[sidx]
Now we want to know if the relationship between x and y is linear or non-linear.
To do this, we fit a series of models of increasing complexity. To simplify this, we set up a basic model and define design matrices for different models:
# Generic spline regression model for quap: mu is a linear combination of
# the basis columns in the design matrix X.
quap.spline.model =
alist(
y ~ dnorm(mu,sigma),
mu <- X %*% b ,
# vectorized prior: every coefficient gets normal(0, 1)
b ~ dnorm(0,1),
# NOTE(review): a normal prior on sigma puts some mass below zero --
# presumably acceptable for quap here, but confirm this is intended
sigma ~ dnorm(.25,.1)
)
library(splines)
# design matrices of increasing flexibility; cbind(1) appends an
# intercept column to each basis
B3 = bs(x, df = 3) %>% cbind(1)
B6 = bs(x, df = 6) %>% cbind(1)
B18 = bs(x, df = 18) %>% cbind(1)
LM = model.matrix(~ 0 + x) %>% cbind(1)
Before we fit the models, lets look at prior predictions for
mu:
par(mfrow = c(2,2), mar=c(2.5,2.5,.5,.5), mgp=c(1,.1,0), tck=-.01)
o = order(x)
# Prior predictive check for mu: draw 55 coefficient vectors from the
# b ~ normal(0, 1) prior and plot the implied curves X %*% b.
# x and o are taken from the calling environment.
pp = function(X) {
  b = matrix(rnorm(ncol(X) * 55), ncol = 55)
  yhat = X %*% b
  # FIX: the vertical axis shows prior draws of mu, not x
  # (ylab was mistakenly "x")
  matplot(x[o], yhat[o, ], type = 'l', col = "blue", lty = 1,
          xlab = "x", ylab = "mu")
}
pp(LM)
pp(B3)
pp(B6)
pp(B18)
With the basic model we have set up, we can now easily fit the linear regression and the spline models.
# Fit the spline and linear models with quap; the start values give quap
# the correct length for the coefficient vector b.
q.B3 =
quap(quap.spline.model,
data = list(X = B3, y = y, x = x),
start = list(b=rep(0, ncol(B3)), sigma = .1))
q.B6 =
quap(quap.spline.model,
data = list(X = B6, y = y, x = x),
start = list(b=rep(0, ncol(B6)), sigma = .1))
q.B18 =
quap(quap.spline.model,
data = list(X = B18, y = y, x = x),
start = list(b=rep(0, ncol(B18)), sigma = .1))
q.LM =
quap(quap.spline.model,
data = list(X = LM, y = y, x = x),
start = list(b=rep(0, ncol(LM)), sigma = .1))
# work around a bug: WAIC/PSIS need an nobs attribute on the quap fits.
# NOTE(review): nobs is set to 50, but the fitted sample size is nn = 75
# -- confirm the intended value.
attr(q.B3,"nobs") = 50
attr(q.B6,"nobs") = 50
attr(q.B18,"nobs") = 50
attr(q.LM,"nobs") = 50
And we plot the model predictions:
par(mfrow = c(1,1), mar=c(3,3,.5,.5), mgp=c(1.25,.1,0), tck=-.01)
# Plot the posterior mean of mu with a 90% credible band for a quap fit.
# q.fit: quap fit object; add: overlay on an existing plot; col: color.
# y is taken from the calling environment.
plot_preds = function(q.fit, add = FALSE, col = "blue") {
# posterior draws of mu via rethinking's link()
mu.mat = link(q.fit,n = 1e4)
if(is.list(mu.mat)) mu.mat = mu.mat[["mu"]]
mu = apply(mu.mat,2,mean)
# 5% and 95% quantiles -> 90% credible interval
mu.ci = apply(mu.mat,2,function(x) quantile(x,c(.05,.95)))
x = q.fit@data[["x"]]
o = order(x)
if (add == FALSE) {
plot(x,y, ylim = range(mu.ci))
}
lines(x[o],mu[o],'l', col = col)
shade(mu.ci[,o],x[o], col = adjustcolor(col,alpha = .25))
}
# Overlay all four fits on the data (most flexible model drawn last)
plot(x,y, ylim = c(0,1.35), pch = 16)
plot_preds(q.LM, add = TRUE, "orange")
plot_preds(q.B3, add = TRUE, "blue")
plot_preds(q.B6, add = TRUE, "purple")
plot_preds(q.B18, add = TRUE, "violet")
Unsurprisingly, more flexible models fit the data better. But which
model has the best estimated out of sample predictions? This time we use
PSIS-LOO to compare the models:
# Compare the models by PSIS-LOO instead of WAIC
compare(q.B3,q.B6,q.B18,q.LM, func = "PSIS", n = 1e4) %>% round(2)
## Some Pareto k values are very high (>1). Set pointwise=TRUE to inspect individual points.
## Some Pareto k values are very high (>1). Set pointwise=TRUE to inspect individual points.
## Some Pareto k values are very high (>1). Set pointwise=TRUE to inspect individual points.
## Some Pareto k values are very high (>1). Set pointwise=TRUE to inspect individual points.
## PSIS SE dPSIS dSE pPSIS weight
## q.B3 -67.94 27.51 0.00 NA 8.18 0.92
## q.LM -62.13 25.22 5.81 5.84 6.16 0.05
## q.B6 -61.09 27.11 6.84 2.83 11.79 0.03
## q.B18 -51.68 27.40 16.25 16.09 28.32 0.00
OK this warning tells us that there are some very influential data points that determine to a large degree the PSIS-LOO values. Even if we are tempted to compare models now, there is really no point because we can’t trust the PSIS-LOO values.
To improve the situation, let's look at which points are influential. To do
this, we can use PSIS with the flag
pointwise = TRUE to get the parameter k for
each data point; k measures how influential each observation is for the PSIS-LOO estimate.
par(mfrow = c(2,2), mar=c(3,3,.5,.5), mgp=c(1.25,.1,0), tck=-.01)
# Plot a fit and mark influential observations by their Pareto k
# diagnostic: orange for k > 0.7 (high), red for k > 1 (very high).
# NOTE(review): "influencial" is a typo for "influential"; the name is
# kept unchanged because the function is called again further below.
plot_influencial = function(q.fit) {
plot(x,y, pch = 16)
plot_preds(q.fit, add = TRUE, "blue")
# pointwise = TRUE returns one row per observation, including k
PSISLOO = PSIS(q.fit, pointwise = TRUE,n = 5e4)
points(x[which(PSISLOO$k > .7)],y[which(PSISLOO$k > .7)], pch = 16, col = "orange")
points(x[which(PSISLOO$k > 1)],y[which(PSISLOO$k > 1)], pch = 16, col = "red")
# annotate each panel with the estimated residual sd
text(2,.2, paste("sigma = ",round(coef(q.fit)["sigma"],3)), pos = 2)
}
plot_influencial(q.LM)
## Some Pareto k values are high (>0.5). Set pointwise=TRUE to inspect individual points.
plot_influencial(q.B3)
## Some Pareto k values are very high (>1). Set pointwise=TRUE to inspect individual points.
plot_influencial(q.B6)
## Some Pareto k values are very high (>1). Set pointwise=TRUE to inspect individual points.
plot_influencial(q.B18)
## Some Pareto k values are very high (>1). Set pointwise=TRUE to inspect individual points.
Note that more complex models have a higher number of influential
data points. The intuition here is that the flexibility of
mu allows the error variance to be smaller. The price one
pays is that data points that are far away from mu become
very influential, because the density falls off quickly.
The PSIS warnings are another instance of the folk theorem of statistical modelling, which states that modeling and estimation problems are often a result of mis-specified statistical models.
The figure makes clear that the model does a bad job dealing with outliers. One method to deal with outliers and that comes under the name “robust regression” uses the student t distribution as an error model (instead of the normal distribution.):
# Compare a standard normal (dotted) with a student t, nu = 2 (solid):
# the t distribution has heavier tails, so extreme values are less
# surprising under it.
# FIX: 'mean = ' was left empty (silently fell back to the default);
# make the value explicit.
curve(dnorm(x, mean = 0, sd = 1), -4, 4, n = 500, ylab = "density", lty = 3)
curve(dstudent(x, nu = 2, mu = 0, sigma = 1), -4, 4, n = 500, add = TRUE)
# FIX: the legend line types were swapped relative to the drawn curves
# (dnorm is dotted, lty = 3; dstudent is solid, lty = 1)
legend("topleft", lty = c(3,1),
       legend = c("dnorm(0,1)","dstudent(0,2,1)"),
       bty = "n")
Because the density of the student t distribution is lower close to
zero and higher farther away from zero, observed values that are far away
from mu will have a larger likelihood and will thus be less
influential.
The next figure shows that by using a student t error distribution,
we can increase the likelihood of values far away from mu,
decrease the likelihood of values close to mu and thus
reduce the relative importance of the outlier values.
# Replot the most flexible model and, at each influential point, overlay
# the normal (dotted) and student t (solid) error densities around the
# fitted mu, drawn sideways along the y axis.
plot_influencial(q.B18)
## Some Pareto k values are very high (>1). Set pointwise=TRUE to inspect individual points.
yhat = colMeans(link(q.B18,n = 1e4))
sigma = coef(q.B18)["sigma"]
PSISLOO = PSIS(q.B18, pointwise = TRUE, n = 5e4)
## Some Pareto k values are very high (>1). Set pointwise=TRUE to inspect individual points.
xs = seq(-1,1,.01)
# densities scaled by 1/4 so they fit inside the plot region
d = dnorm(xs,0,sigma)/4
ds = dstudent(xs,0,nu = 2, sigma)/4
# draw the two densities and a grey reference line at each high-k point
for (j in which(PSISLOO$k > .7)) {
lines(d+x[j],xs+yhat[j], lty = 3)
lines(ds+x[j],xs+yhat[j])
lines(x = rep(x[j],2), y = c(min(xs), max(xs))+yhat[j], col = "grey")
}
Now lets re-estimate the data with a student-t error distribution. Here is the basic model.
# Robust spline model: student t errors instead of normal errors.
quap.spline.model =
alist(
y ~ dstudent(nu, mu, sigma),
mu <- X %*% b ,
# NOTE(review): '<-' makes this line deterministic, so nu is the
# constant 1 + dexp(1) ~= 1.37 (dexp(1) is a density value, not a random
# draw). A prior such as 'nu ~ ...' was possibly intended -- confirm.
nu <- 1 + dexp(1),
b ~ dnorm(0,1),
sigma ~ dnorm(.25,.1)
)
# Refit all four models with the robust (student t) error distribution.
q.B3 =
quap(quap.spline.model,
data = list(X = B3, y = y, x = x),
start = list(b=rep(0, ncol(B3)), sigma = .1))
q.B6 =
quap(quap.spline.model,
data = list(X = B6, y = y, x = x),
start = list(b=rep(0, ncol(B6)), sigma = .1))
q.B18 =
quap(quap.spline.model,
data = list(X = B18, y = y, x = x),
start = list(b=rep(0, ncol(B18)), sigma = .1))
q.LM =
quap(quap.spline.model,
data = list(X = LM, y = y, x = x),
start = list(b=rep(0, ncol(LM)), sigma = .1))
# work around a bug: WAIC/PSIS need an nobs attribute on the quap fits.
# NOTE(review): the fitted sample size is nn = 75, not 50 -- confirm.
attr(q.B3,"nobs") = 50
attr(q.B6,"nobs") = 50
attr(q.B18,"nobs") = 50
attr(q.LM,"nobs") = 50
And we compare the models again.
# PSIS-LOO comparison of the robust models (no Pareto k warnings now)
compare(q.B3,q.B6,q.B18,q.LM, func = "PSIS", n = 1e4) %>% round(2)
## PSIS SE dPSIS dSE pPSIS weight
## q.B18 -85.49 20.69 0.00 NA 19.43 0.98
## q.B3 -76.76 16.51 8.73 13.48 6.07 0.01
## q.B6 -74.74 17.06 10.75 13.03 8.20 0.00
## q.LM -70.66 17.06 14.82 16.16 3.35 0.00
This time there are no warnings!
Here are the new models' predictions:
# Overlay the robust models' posterior mean predictions on the data
plot(x,y, ylim = c(0,1.35), pch = 16)
plot_preds(q.LM, add = TRUE, "orange")
plot_preds(q.B3, add = TRUE, "blue")
plot_preds(q.B6, add = TRUE, "purple")
plot_preds(q.B18, add = TRUE, "violet")
Admittedly, things can get more complicated if we are uncertain about which DAG is correct. Model selection criteria can help to select or weight DAGs↩︎